*03_clean_mortality_data.do

* Close any log left open by a previous run (capture suppresses the error
* when none is open), drop data in memory, and widen log lines.
capture log close
clear
set linesize 255

* Project directory globals. Note that $root ends in "/", so the derived
* paths below contain a double slash ("...final//Input").
global root =  "/disk/bulkw/nencka/schooling_pandemic/2021_10_18_final/"
global input   "$root/Input"
global scripts "$root/Scripts"
global temp    "$root/Temp"
* $output was previously assigned twice ("$root/Output", then this value);
* only this later assignment ever took effect, so the dead one was removed.
global output  "$root/Output/predictors"
global log     "$root/Log"


* Plain-text log for this script; overwritten on every run.
log using "$log/03_clean_mortality_data", replace text


*****************************************************************************
*****************************************************************************
*****************************************************************************

*Load the mortality data and compute, for each observation, the ratio of
*actual influenza deaths to the level implied by a city-specific pre-1918 trend

	use "$input/flu_mortality_data.dta", clear
	desc, fullnames

	*Missing influenza death counts are recoded to zero before any analysis
	replace influenza = 0 if mi(influenza)
	sum influenza, d

	*Log outcome; the +1 keeps observations with zero deaths defined
	gen ln_influenza_deaths = ln(influenza + 1)

	*Number of pre-1918 observations per city. total() is the documented
	*replacement for the deprecated egen sum(); summing the 0/1 indicator
	*directly avoids leaving a helper variable in the data.
	bysort state_city_c: egen sum_pre_1918_obs = total(year < 1918)

	*Keep cities with exactly 4 or 5 pre-1918 observations.
	*NOTE(review): this matches ">= 4 pre-periods" only if no city has more
	*than five pre-1918 observations - confirm against the input data.
	keep if inlist(sum_pre_1918_obs, 4, 5)
	sum influenza, d

	*City-specific linear time trends, estimated on pre-1918 observations only
	*(year interacted with city indicators gives each city its own intercept
	*and slope)
	qui: reg ln_influenza_deaths c.year##i.state_city_c if year < 1918

	*Linear prediction; no if/in restriction, so the pre-1918 trend is also
	*extrapolated to observations outside the estimation sample
	predict p_influenza_deaths, xb

	*Convert the predicted log value back to levels.
	*NOTE(review): the outcome was ln(influenza + 1), so exp(xb) is on the
	*scale of influenza + 1 while the ratio below uses influenza in the
	*numerator - confirm this is intended.
	replace p_influenza_deaths = exp(p_influenza_deaths)


	*Ratio of actual influenza deaths to predicted (counterfactual) deaths
	gen excess_death_ratio = influenza/p_influenza_deaths


	*Restrict to 1918 observations and discard rows whose total population
	*is negative (rows with missing pop_total are retained, since a missing
	*value is not < 0 in Stata)
	keep if year == 1918 & !(pop_total < 0)

	*Rename the identifiers to statefip / mcd
	rename state_fips statefip
	rename city mcd


	*Normalize place names in one pass: lower-case, remove every occurrence
	*of " borough" and then of " town", and trim surrounding blanks.
	*NOTE(review): subinstr(..., .) removes these substrings anywhere in the
	*name, not only as a trailing suffix (e.g. " township" would lose its
	*" town" part) - confirm no affected names exist in the data.
	replace mcd = trim(subinstr(subinstr(lower(mcd), " borough", "", .), " town", "", .))


	*Retain only the identifiers and mortality measures, ordered by state and place
	keep statefip mcd excess_death_ratio influenza
	sort statefip mcd


	*Write the cleaned 1918 file to the temp directory
	save "$temp/flu_mortality.dta", replace

log close

